Introduction

Yi & Bourzikas specializes in talent management solutions for Fortune 1000 companies, focusing on building and developing strategies for retaining employees. We specialize in workforce planning, employee training programs, identifying high-potential employees, and reducing/preventing voluntary employee turnover (attrition). As part of this engagement, our data science team will predict employee attrition for your organization.

# Load the training and validation (test) data sets; the literal string
# "NULL" in the CSVs is read as NA.
employee <- read.csv("CaseStudy2-data.csv", na.strings = "NULL")
employeeValidation <- read.csv("CaseStudy2Validation.csv", na.strings = "NULL")
result <- rbind(employee, employeeValidation) # combined raw (non-dummy) data
#Create 1/0 from Catagorical Variables
emp_train <- fastDummies::dummy_cols(employee) # Create Dummy Variables (train)
emp_test <- fastDummies::dummy_cols(employeeValidation) # Create Dummy Variables (test)
# BUG FIX: the original bound the test set to itself (rbind(emp_test, emp_test)),
# so emp_result never contained the training rows despite the comment below.
emp_result <- rbind(emp_train, emp_test) # combine train and test data sets
# Creating Variables
# Named index vectors so downstream code can subset columns by purpose.
cols.Base <- 2:36          # base (non-dummy) feature columns
cols.CatAttr <- 38:39      # dummy-coded Attrition columns
cols.CatAll <- 40:68       # all remaining dummy-coded categorical columns
# Every predictor column except the JobRole dummy columns
col.NoJobRole <- c(1, 2, 5, 7, 8, 10, 12, 14, 15, 18, 20, 21, 22, 25:36, 40:42, 52:53, 63:68)
# Echo the column names selected by col.NoJobRole as a sanity check
names(emp_result[, col.NoJobRole])
##  [1] "ID"                               "Age"                             
##  [3] "DailyRate"                        "DistanceFromHome"                
##  [5] "Education"                        "EmployeeCount"                   
##  [7] "EnvironmentSatisfaction"          "HourlyRate"                      
##  [9] "JobInvolvement"                   "JobSatisfaction"                 
## [11] "MonthlyIncome"                    "MonthlyRate"                     
## [13] "NumCompaniesWorked"               "PercentSalaryHike"               
## [15] "PerformanceRating"                "RelationshipSatisfaction"        
## [17] "StandardHours"                    "StockOptionLevel"                
## [19] "TotalWorkingYears"                "TrainingTimesLastYear"           
## [21] "WorkLifeBalance"                  "YearsAtCompany"                  
## [23] "YearsInCurrentRole"               "YearsSinceLastPromotion"         
## [25] "YearsWithCurrManager"             "BusinessTravel_Travel_Rarely"    
## [27] "BusinessTravel_Travel_Frequently" "BusinessTravel_Non-Travel"       
## [29] "Gender_Female"                    "Gender_Male"                     
## [31] "MaritalStatus_Married"            "MaritalStatus_Divorced"          
## [33] "MaritalStatus_Single"             "Over18_Y"                        
## [35] "OverTime_No"                      "OverTime_Yes"
# Removed 17 From Data Set
# NOTE(review): the literal 66 below sits exactly where 23 would fall in the
# sequence -- confirm 66 is intentional and not a typo for 23.
cols.RemoveJobRoleCat <- c(1:16, 18:22, 66, 24:36)
# All Job Detailed Roles (the nine JobRole dummy columns)
cols.JobRoles <- 54:62
# Every column except the Attrition fields
cols.AllButAttr <- c(1:22, 24:36, 40:53, 63:68)
# This is all the Catagorical Fields (predictor set for the glm models;
# same NOTE(review) about 66 vs 23 applies here)
cols.CatGLM <- c(1:22, 66, 24:36)
# Predictor set for the KNN models (includes column 3, used as the class label)
cols.CatKNN <- c(1:3, 5, 7, 8, 10:12, 14:16, 18, 20:22, 25:36, 40:68)
# All numeric columns (cols.CatKNN without column 3)
cols.NumericAll <- c(1, 2, 5, 7, 8, 10:12, 14:16, 18, 20:22, 25:36, 40:68)
cols.Attrition <- 34
# Key significant fields used for the base model and heat maps
cols.KeyFieldsBaseModel <- c(40:42, 7, 12, 63:65, 22, 67:68, 27, 30:36)

We received the data files from your employee database and have outlined some key highlights. The following charts are part of our exploratory data analysis and will give your organization an idea of how the different features in the data set relate to one another. The pair plots show all the variables based on whether your employees have left the organization.

# Basic EDA
#EDA - Exploratory Not for Report
# Pair plots over successive groups of raw feature columns, colored by
# attrition status.
# BUG FIX: the original colored emp_result (train + test rows) with
# emp_train$Attrition, a shorter vector that silently recycled and
# mis-colored the test-set rows; color by emp_result$Attrition instead.
eda_col_ranges <- list(2:5, 6:10, 11:15, 16:20, 21:25, 26:30, 31:35, 36:40)
for (rng in eda_col_ranges) {
  pairs(emp_result[, rng], col = emp_result$Attrition)
}

Heat Map Charts

Based on the patterns we observed in the pair plots above, we developed two heat maps and a correlation and distribution matrix to take a deeper dive into the data set.

# Heat Map for All Fields
# Correlation matrix (rounded to 2 dp) over every numeric column.
# NOTE(review): the warning below is caused by zero-variance columns --
# the model summary later shows EmployeeCount/StandardHours/Over18_Y are
# constant (singular), so their sd is 0 and their correlations are NA;
# confirm whether they should be excluded from the heat map.
employeeHeatMap <- round(cor(emp_result[,c(cols.NumericAll)]),2)
## Warning in cor(emp_result[, c(cols.NumericAll)]): the standard deviation is
## zero
# Reshape the matrix to long form (X1, X2, value) for ggplot tiling
melted_employeeHeatMap <- melt(employeeHeatMap)
ggplot(data = melted_employeeHeatMap, aes(x=X1, y=X2, fill=value)) + 
  theme(axis.text.x  = element_blank(),axis.ticks.x=element_blank(),axis.title.x=element_blank(),axis.text.y  = element_text(size = 7))+geom_tile()

#ggsave("images/employeeHeatMap.png",plot = last_plot(), type = png())
# Heat Map for Key Sign Fields
# Same heat map restricted to the key significant fields
employeeHeatMapSig <- round(cor(emp_result[,c(cols.KeyFieldsBaseModel)]),2)
melted_employeeHeatMapSig <- melt(employeeHeatMapSig)
ggplot(data = melted_employeeHeatMapSig, aes(x=X1, y=X2, fill=value)) + 
  theme(axis.text.x  = element_blank(),axis.ticks.x=element_blank(),axis.title.x=element_blank(),axis.text.y  = element_text(size = 7))+
  geom_tile()

#ggsave("images/employeeHeatMapSig.png",plot = last_plot(), type = png())
# EDA For Key Sign Fields on Attrition for Overall Model
# Pair-plot matrix (GGally::ggpairs) of the key fields, colored by Attrition;
# stored in ggkeySignPairs (not printed here, only saved via the ggsave below)
ggkeySignPairs <- ggpairs(
  mapping = ggplot2::aes(color = emp_result$Attrition),
  emp_result[,c(cols.KeyFieldsBaseModel)], 
  diag=list(continuous="densityDiag", discrete="barDiag"), 
  axisLabels="show") + theme_minimal()
#ggsave("ggkeySignPairs.png",plot = last_plot(), type = png())

Significant Key Factors Contributing to Attrition

Showing the significance, p-values, for each variable. This is the base information to understand the key contributors that affect attrition. This data will be used through our data science work so we can utilize the following variables for the models we build.

The following table outlines the top significant factors contributing to attrition:

# Fit the same logistic regression (Attrition ~ all cols.CatGLM predictors)
# on the training, test, and combined data sets, then tabulate each model's
# coefficient p-values (column 4 of the summary table, Pr(>|z|)) side by side.
#TrainDataSet
glm_modeltrain <- glm(emp_train$Attrition ~ ., emp_train[, cols.CatGLM], family = binomial) # glm train
model_Train <- data.frame(coef(summary(glm_modeltrain))[, 4]) # p-values from glm train
# FIX: corrected "Regressio" typo in the displayed column title
names(model_Train) <- "Logistic Regression on Training Set"
#TestDataSet
glm_modeltest <- glm(emp_test$Attrition ~ ., emp_test[, cols.CatGLM], family = binomial) # glm test
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
model_Test <- data.frame(coef(summary(glm_modeltest))[, 4]) # p-values from glm test
# FIX: title was "Logistic Regression Test on Test"; made consistent with the others
names(model_Test) <- "Logistic Regression on Test Set"
#AllData
glm_modelAll <- glm(emp_result$Attrition ~ ., emp_result[, cols.CatGLM], family = binomial) # glm for combined train + test
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
model_All <- data.frame(coef(summary(glm_modelAll))[, 4]) # p-values from combined data set
# FIX: corrected "Regressio" typo in the displayed column title
names(model_All) <- "Logistic Regression on All Data"
# Table consolidated: one p-value column per data set, rows = coefficients
GLM_dataset <- cbind(model_Train, model_Test, model_All)
# Creating kable table for GLM dataset
GLM_dataset  %>%  kable() %>% kable_styling(bootstrap_options = "striped", full_width = F) %>% scroll_box(width = "600px", height = "450px")
Logistic Regressio on Training Set Logistic Regression Test on Test Logistic Regressio on All Data
(Intercept) 0.9856239 0.9989900 0.9985716
ID 0.7413226 0.5564256 0.4055318
Age 0.2081840 0.0156765 0.0006326
BusinessTravelTravel_Frequently 0.0000674 0.0010245 0.0000034
BusinessTravelTravel_Rarely 0.0257045 0.0164520 0.0006930
DailyRate 0.1137743 0.0637612 0.0087484
DepartmentResearch & Development 0.9840834 0.9999149 0.9998797
DepartmentSales 0.9855939 0.9982575 0.9975358
DistanceFromHome 0.0003241 0.0022409 0.0000154
Education 0.8266214 0.1306817 0.0325598
EducationFieldLife Sciences 0.9750329 0.3202993 0.1598708
EducationFieldMarketing 0.8437994 0.5521055 0.4004024
EducationFieldMedical 0.9098803 0.2537434 0.1065171
EducationFieldOther 0.6960025 0.9285191 0.8990460
EducationFieldTechnical Degree 0.4683605 0.4508936 0.2863276
EmployeeNumber 0.9027209 0.4249982 0.2592223
EnvironmentSatisfaction 0.0000007 0.0800222 0.0132989
GenderMale 0.0660375 0.1909220 0.0643729
HourlyRate 0.2337414 0.3492105 0.1855442
JobInvolvement 0.0000060 0.8371013 0.7712302
JobLevel 0.3421974 0.0316539 0.0023752
JobRoleHuman Resources 0.9829785 0.9981111 0.9973286
JobRoleLaboratory Technician 0.0127075 0.9921081 0.9888393
JobRoleManager 0.8653399 0.9989564 0.9985241
JobRoleManufacturing Director 0.8918567 0.9931527 0.9903166
JobRoleResearch Director 0.0349614 0.9909768 0.9872395
JobRoleResearch Scientist 0.8565130 0.9923366 0.9891625
JobRoleSales Executive 0.1372067 0.9999902 0.9999861
JobRoleSales Representative 0.0299926 0.9995330 0.9993395
JobSatisfaction 0.0000192 0.0068582 0.0001315
MaritalStatusMarried 0.0601860 0.4269320 0.2612143
MaritalStatusSingle 0.0008135 0.1077738 0.0229388
MonthlyIncome 0.2085435 0.0074394 0.0001536
MonthlyRate 0.4484720 0.0070837 0.0001399
NumCompaniesWorked 0.0000051 0.0129331 0.0004395
OverTimeYes 0.0000000 0.0000101 0.0000000
PercentSalaryHike 0.6585720 0.5792289 0.4329243
PerformanceRating 0.7420886 0.0477017 0.0051076
RelationshipSatisfaction 0.0363799 0.0009600 0.0000030
StockOptionLevel 0.2540503 0.7718614 0.6817778
TotalWorkingYears 0.0141019 0.6265806 0.4914031
TrainingTimesLastYear 0.0335913 0.2194417 0.0824447
WorkLifeBalance 0.0006481 0.7803786 0.6933455
YearsAtCompany 0.0107927 0.1789009 0.0573137
YearsInCurrentRole 0.0034318 0.0492443 0.0054190
YearsSinceLastPromotion 0.0000925 0.0064181 0.0001159
YearsWithCurrManager 0.0131109 0.0292065 0.0020427

Basic Regression for Each Job Role

Learning about any job-role-specific trends that may exist in the data set is key because it tells us which variables are significant by job. This data can be used to identify which factors contribute to the attrition rate for each job. Any p-value that is < 0.05 is significant.

From the tables below, each job description will show the key attributes for attrition:

# Per-job-role logistic regressions: for each JobRole dummy column, regress the
# role indicator on every non-JobRole predictor (col.NoJobRole) and keep only
# the coefficient p-values so each role's significant attributes can be compared.
# Several fits warn ("fitted probabilities numerically 0 or 1 occurred" /
# "algorithm did not converge"); their near-1 p-values should be read with caution.

# Fit one binary-indicator glm against the non-JobRole predictors
fit_role_glm <- function(y) {
  glm(y ~ ., data = emp_result[, col.NoJobRole], family = binomial)
}
# Extract the p-value column of a fitted glm as a one-column, titled data.frame
glm_pvalues <- function(fit, title) {
  out <- data.frame(coef(summary(fit))[, 4])
  names(out) <- title
  out
}

# Glm for Job role - Human Resources
glm_model_JobRoleHR <- fit_role_glm(emp_result$`JobRole_Human Resources`)
JobRoleHR <- glm_pvalues(glm_model_JobRoleHR, "Human Resources")
# Glm for Job role - Manufacturing Director
glm_model_JobRoleManufactoring <- fit_role_glm(emp_result$`JobRole_Manufacturing Director`)
JobRoleManufactoring <- glm_pvalues(glm_model_JobRoleManufactoring, "Manufacturing Director")
# Glm for Job role - Research Scientist
glm_model_JobRoleResearch <- fit_role_glm(emp_result$`JobRole_Research Scientist`)
JobRoleResearch <- glm_pvalues(glm_model_JobRoleResearch, "Research Scientist")
# Glm for Job role - Lab Tech
glm_model_JobRoleLab <- fit_role_glm(emp_result$`JobRole_Laboratory Technician`)
JobRoleLab <- glm_pvalues(glm_model_JobRoleLab, "Laboratory Technician")
# Glm for Job role - Research Director
glm_model_JobRoleResearchDirector <- fit_role_glm(emp_result$`JobRole_Research Director`)
JobRoleRD <- glm_pvalues(glm_model_JobRoleResearchDirector, "Research Director")
# Glm for Job role - Sales Exec
glm_model_JobRoleSalesExec <- fit_role_glm(emp_result$`JobRole_Sales Executive`)
JobRoleSE <- glm_pvalues(glm_model_JobRoleSalesExec, "Sales Executive")
# Glm for Job role - Sales Person
glm_model_JobRoleSalesPerson <- fit_role_glm(emp_result$`JobRole_Sales Representative`)
JobRoleSP <- glm_pvalues(glm_model_JobRoleSalesPerson, "Sales Representative")
# Glm for Job role - Manager
glm_model_JobRoleManager <- fit_role_glm(emp_result$JobRole_Manager)
JobRoleManager <- glm_pvalues(glm_model_JobRoleManager, "Manager")
# Glm for Job role - HealthCare
glm_model_JobRoleHealth <- fit_role_glm(emp_result$`JobRole_Healthcare Representative`)
JobRoleHealthR <- glm_pvalues(glm_model_JobRoleHealth, "Healthcare Representative")

# Gender by Job Role (binary response modeled on the JobRole dummies)
glm_model_Gender <- glm(emp_train$Gender ~ ., data = emp_train[, cols.JobRoles], family = binomial)
Gender_Model <- glm_pvalues(glm_model_Gender, "Gender")
# Marital Status by Role
# NOTE(review): MaritalStatus has three levels; a binomial glm collapses it to
# first-level-vs-rest -- confirm that contrast is intended.
glm_model_Marital <- glm(emp_train$MaritalStatus ~ ., data = emp_train[, cols.JobRoles], family = binomial)
Marital_Model <- glm_pvalues(glm_model_Marital, "Marital Status")

# Consolidated all the job role glm
# NOTE(review): JobRoleSP (Sales Representative) and JobRoleHealthR (Healthcare
# Representative) are computed above but omitted here -- confirm intentional.
Table.glm <- cbind(JobRoleHR, JobRoleManufactoring, JobRoleResearch, JobRoleLab, JobRoleRD, JobRoleSE, JobRoleManager)
# kable output for the consolidated glm
Table.glm  %>%  kable() %>% kable_styling(bootstrap_options = "striped", full_width = F) %>% scroll_box(width = "800px", height = "450px")
Human Resources Manufacturing Director Research Scientist Laboratory Technician Research Director Sales Executive Manager
(Intercept) 0.9898902 0.8562672 0.0105992 0.7634204 0.9978978 0.0476180 0.9978271
ID 0.7800491 0.0850500 0.0037740 0.3127304 0.9980976 0.1538642 0.9998129
Age 0.5623287 0.0011058 0.1447794 0.0967646 0.9980432 0.0913818 0.9981136
DailyRate 0.0566140 0.1108373 0.4081995 0.2371188 0.9968457 0.5493793 0.9956510
DistanceFromHome 0.8972205 0.9493385 0.2216075 0.5860499 0.9994825 0.6118137 0.9998753
Education 0.0073892 0.2810289 0.0009405 0.8614410 0.9899819 0.6872301 0.9971418
EnvironmentSatisfaction 0.2027087 0.3123112 0.0141533 0.0043965 0.9915696 0.0302721 0.9994402
HourlyRate 0.3740712 0.9479455 0.0121477 0.3686677 0.9947282 0.5606605 0.9988406
JobInvolvement 0.7070367 0.1618613 0.3183384 0.7401576 0.9965561 0.1738210 0.9993601
JobSatisfaction 0.1048157 0.0776753 0.3254296 0.5401725 0.9989589 0.6174958 0.9992501
MonthlyIncome 0.0224822 0.2626945 0.0000000 0.0000000 0.9845461 0.0011392 0.9954927
MonthlyRate 0.5427228 0.4894283 0.1768713 0.7061777 0.9925435 0.0915706 0.9986764
NumCompaniesWorked 0.0203893 0.7453919 0.1440189 0.1403865 0.9938553 0.2820852 0.9975385
PercentSalaryHike 0.0143982 0.0682091 0.3278690 0.0632794 0.9957684 0.9146964 0.9990989
PerformanceRating 0.0648621 0.7289152 0.9049755 0.0154966 0.9956879 0.4222443 0.9979472
RelationshipSatisfaction 0.0000869 0.5796037 0.8065068 0.0524516 0.9885338 0.0024727 0.9991710
StockOptionLevel 0.1236563 0.8992230 0.2760117 0.7811183 0.9981700 0.8869781 0.9998946
TotalWorkingYears 0.6211428 0.8135194 0.3365269 0.5438628 0.9984232 0.0000230 0.9996878
TrainingTimesLastYear 0.0808694 0.6323765 0.1939522 0.1884160 0.9937384 0.1884307 0.9993732
WorkLifeBalance 0.0645333 0.5504761 0.3950650 0.6004307 0.9986976 0.3066064 0.9982452
YearsAtCompany 0.5449955 0.3139114 0.0648560 0.7427766 0.9946482 0.9130788 0.9986172
YearsInCurrentRole 0.0901148 0.1030673 0.0498326 0.7287665 0.9974437 0.8042547 0.9981619
YearsSinceLastPromotion 0.2196358 0.3914967 0.9653856 0.3582245 0.9985325 0.4563314 0.9998616
YearsWithCurrManager 0.9564791 0.6897727 0.0235609 0.3843670 0.9956744 0.0044464 0.9987819
BusinessTravel_Travel_Rarely 0.4988297 0.1195384 0.0095628 0.0568347 0.9991386 0.1092909 0.9987905
BusinessTravel_Travel_Frequently 0.0291885 0.4796807 0.3334086 0.9489262 0.9974423 0.4305227 0.9997462
Gender_Female 0.2969850 0.1212829 0.0529954 0.0003685 0.9934974 0.1012798 0.9970246
MaritalStatus_Married 0.9873648 0.2143762 0.1368206 0.3113332 0.9988253 0.6662205 0.9996602
MaritalStatus_Divorced 0.9879489 0.8211764 0.4981004 0.7711347 0.9991421 0.3085217 0.9988060
OverTime_No 0.0034983 0.3745554 0.5164776 0.3319150 0.9925320 0.1718642 0.9996636
# kable output for the Gender by Job Role glm
# (scrollable striped table of p-values, one row per JobRole dummy)
Gender_Model  %>%  kable() %>% kable_styling(bootstrap_options = "striped", full_width = F) %>% scroll_box(width = "500px", height = "450px")
Gender
(Intercept) 0.0922882
JobRole_Human Resources 0.3306074
JobRole_Manufacturing Director 0.3235260
JobRole_Research Scientist 0.4690106
JobRole_Laboratory Technician 0.1812029
JobRole_Research Director 0.3882230
JobRole_Sales Executive 0.5296589
JobRole_Sales Representative 0.9953475
JobRole_Manager 0.5411048
# kable output for the Marital Status by Job Role glm
# (scrollable striped table of p-values, one row per JobRole dummy)
Marital_Model  %>%  kable() %>% kable_styling(bootstrap_options = "striped", full_width = F) %>% scroll_box(width = "500px", height = "450px")
Marital Status
(Intercept) 0.0000014
JobRole_Human Resources 0.9587634
JobRole_Manufacturing Director 0.9664449
JobRole_Research Scientist 0.5597292
JobRole_Laboratory Technician 0.5325639
JobRole_Research Director 0.4853543
JobRole_Sales Executive 0.7642466
JobRole_Sales Representative 0.0541812
JobRole_Manager 0.5038035

KNN Model

Running the full KNN model using the training and test data sets, we found the full KNN model came out to have a high accuracy rate of 84%. From there, we decided to run the KNN model by job role. The glm showed us that each job has different variables of significance, so the KNN by job reflects the different variables that pertain to that specific role.

As a key note, KNN works better with larger data sets than with the smaller subsets produced by splitting the data into job positions.

# KNN
# Fit a caret KNN model on the full dummy-coded training set; with no
# trControl supplied, caret bootstrap-resamples over its default k grid
# and picks the best k by accuracy (output below selects k = 9).
set.seed(123)
# Earlier tuned call (custom trControl/tuneGrid) kept for reference:
#knn.train = train(Attrition~., data=emp_train[,c(cols.CatKNN)], method="knn", trControl=control, tuneGrid=grid1)
knn.train = train(Attrition~., data=emp_train[,c(cols.CatKNN)], method="knn")
knn.train
## k-Nearest Neighbors 
## 
## 1170 samples
##   56 predictor
##    2 classes: 'No', 'Yes' 
## 
## No pre-processing
## Resampling: Bootstrapped (25 reps) 
## Summary of sample sizes: 1170, 1170, 1170, 1170, 1170, 1170, ... 
## Resampling results across tuning parameters:
## 
##   k  Accuracy   Kappa      
##   5  0.7782905  0.008056618
##   7  0.7956919  0.005752565
##   9  0.8099105  0.010861496
## 
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was k = 9.
#Set K=18 sq of 1480
# class::knn -- column 3 of the cols.CatKNN subset is Attrition, so it is
# dropped from the feature matrices ([,-3]) and used as the training labels ([,3]).
# NOTE(review): the training set has 1170 rows (per the caret output above) and
# neither sqrt(1480) ~ 38 nor sqrt(1170) ~ 34 equals 18 -- confirm the k = 18
# rationale.
knn.test = knn(emp_train[,c(cols.CatKNN)][,-3], emp_test[,c(cols.CatKNN)][,-3], emp_train[,c(cols.CatKNN)][,3], k=18)
# Confusion matrix of KNN predictions vs. actual test-set Attrition
knnPrediction <-confusionMatrix(table(knn.test, emp_test$Attrition))
knnPrediction
## Confusion Matrix and Statistics
## 
##         
## knn.test  No Yes
##      No  251  48
##      Yes   0   1
##                                           
##                Accuracy : 0.84            
##                  95% CI : (0.7935, 0.8796)
##     No Information Rate : 0.8367          
##     P-Value [Acc > NIR] : 0.4758          
##                                           
##                   Kappa : 0.0337          
##  Mcnemar's Test P-Value : 1.17e-11        
##                                           
##             Sensitivity : 1.00000         
##             Specificity : 0.02041         
##          Pos Pred Value : 0.83946         
##          Neg Pred Value : 1.00000         
##              Prevalence : 0.83667         
##          Detection Rate : 0.83667         
##    Detection Prevalence : 0.99667         
##       Balanced Accuracy : 0.51020         
##                                           
##        'Positive' Class : No              
## 
# Visualize the KNN confusion matrix as a fourfold plot
fourfoldplot(knnPrediction$table)

KNN Weighted

Running the Weighted KNN model using the training and test data sets, we found the Weighted KNN model came out to have a higher accuracy rate of 84.4% than the KNN, which was 84%. Additionally, the plot below shows the optimal K, which is 30.

# K Weighted
# Weighted KNN (kknn): train.kknn tries k = 1..kmax with Euclidean
# distance (distance = 2) and selects the best k.
set.seed(123)
#performs leave-one-out crossvalidation 
kknn.train = train.kknn(Attrition~., data=emp_train[,c(cols.CatKNN)], kmax=30, distance = 2)
#Predict Attrition on the test set
# Column 3 of the cols.CatKNN subset is Attrition; drop it from the predictors
prediction <- predict(kknn.train, emp_test[,c(cols.CatKNN)][,-3])
#Show Confusion Matrix (predicted vs. actual test-set Attrition, column 3)
kWeightedPrediction <- confusionMatrix(table(prediction, emp_test[,c(cols.CatKNN)][,3]))
kWeightedPrediction
## Confusion Matrix and Statistics
## 
##           
## prediction  No Yes
##        No  251  47
##        Yes   0   2
##                                           
##                Accuracy : 0.8433          
##                  95% CI : (0.7972, 0.8826)
##     No Information Rate : 0.8367          
##     P-Value [Acc > NIR] : 0.4139          
##                                           
##                   Kappa : 0.0665          
##  Mcnemar's Test P-Value : 1.949e-11       
##                                           
##             Sensitivity : 1.00000         
##             Specificity : 0.04082         
##          Pos Pred Value : 0.84228         
##          Neg Pred Value : 1.00000         
##              Prevalence : 0.83667         
##          Detection Rate : 0.83667         
##    Detection Prevalence : 0.99333         
##       Balanced Accuracy : 0.52041         
##                                           
##        'Positive' Class : No              
## 
# Visualize the weighted-KNN confusion matrix as a fourfold plot
fourfoldplot(kWeightedPrediction$table)

# Plot Prediction for number of K
# graphics.off() closes all open devices so the plot renders on a fresh one
graphics.off() 
par(mar=c(5,5,5,5))
plot(kknn.train)

Logistic_Regression

The following model is a logistic regression fit on the training set and evaluated on the test set using all of the fields. For logistic regression to work, the data was formatted as numeric, and the predicted probabilities were converted into class labels using a 0.5 cutoff. With this model, we predicted at an 87% accuracy rate.

# Logistic Regression (No Lasso) - Winning Model
#Base Model
# Fit Attrition ~ all cols.CatGLM predictors on the training set; this model
# drives the final test-set predictions (87% accuracy, shown below).
glm_model <- glm(emp_train$Attrition~.,data = emp_train[,c(cols.CatGLM)], family = binomial)
summary(glm_model)
## 
## Call:
## glm(formula = emp_train$Attrition ~ ., family = binomial, data = emp_train[, 
##     c(cols.CatGLM)])
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.5831  -0.4893  -0.2395  -0.0824   3.2351  
## 
## Coefficients: (3 not defined because of singularities)
##                                    Estimate Std. Error z value Pr(>|z|)
## (Intercept)                      -1.254e+01  6.957e+02  -0.018 0.985624
## ID                                9.763e-05  2.958e-04   0.330 0.741323
## Age                              -1.917e-02  1.523e-02  -1.259 0.208184
## BusinessTravelTravel_Frequently   1.858e+00  4.663e-01   3.985 6.74e-05
## BusinessTravelTravel_Rarely       9.574e-01  4.292e-01   2.231 0.025704
## DailyRate                        -3.949e-04  2.497e-04  -1.581 0.113774
## DepartmentResearch & Development  1.388e+01  6.957e+02   0.020 0.984083
## DepartmentSales                   1.256e+01  6.957e+02   0.018 0.985594
## DistanceFromHome                  4.361e-02  1.213e-02   3.595 0.000324
## Education                         2.211e-02  1.009e-01   0.219 0.826621
## EducationFieldLife Sciences      -3.611e-02  1.154e+00  -0.031 0.975033
## EducationFieldMarketing           2.371e-01  1.203e+00   0.197 0.843799
## EducationFieldMedical            -1.310e-01  1.157e+00  -0.113 0.909880
## EducationFieldOther              -4.750e-01  1.216e+00  -0.391 0.696003
## EducationFieldTechnical Degree    8.451e-01  1.165e+00   0.725 0.468361
## EmployeeCount                            NA         NA      NA       NA
## EmployeeNumber                   -2.082e-05  1.704e-04  -0.122 0.902721
## EnvironmentSatisfaction          -4.711e-01  9.506e-02  -4.956 7.19e-07
## GenderMale                        3.846e-01  2.092e-01   1.838 0.066038
## HourlyRate                        6.008e-03  5.046e-03   1.191 0.233741
## JobInvolvement                   -6.303e-01  1.392e-01  -4.526 6.00e-06
## JobLevel                         -3.281e-01  3.454e-01  -0.950 0.342197
## JobRoleHuman Resources            1.484e+01  6.957e+02   0.021 0.982979
## JobRoleLaboratory Technician      1.284e+00  5.151e-01   2.492 0.012707
## JobRoleManager                    1.561e-01  9.205e-01   0.170 0.865340
## JobRoleManufacturing Director    -7.803e-02  5.739e-01  -0.136 0.891857
## JobRoleResearch Director         -2.824e+00  1.339e+00  -2.109 0.034961
## JobRoleResearch Scientist         9.597e-02  5.307e-01   0.181 0.856513
## JobRoleSales Executive            2.046e+00  1.376e+00   1.486 0.137207
## JobRoleSales Representative       3.097e+00  1.427e+00   2.170 0.029993
## JobSatisfaction                  -3.917e-01  9.164e-02  -4.274 1.92e-05
## MaritalStatusMarried              5.912e-01  3.146e-01   1.879 0.060186
## MaritalStatusSingle               1.334e+00  3.984e-01   3.348 0.000814
## MonthlyIncome                     1.129e-04  8.975e-05   1.258 0.208543
## MonthlyRate                      -1.078e-05  1.423e-05  -0.758 0.448472
## NumCompaniesWorked                2.006e-01  4.399e-02   4.560 5.12e-06
## Over18_Y                                 NA         NA      NA       NA
## OverTimeYes                       1.959e+00  2.216e-01   8.841  < 2e-16
## PercentSalaryHike                -1.991e-02  4.505e-02  -0.442 0.658572
## PerformanceRating                 1.509e-01  4.586e-01   0.329 0.742089
## RelationshipSatisfaction         -1.950e-01  9.319e-02  -2.093 0.036380
## StandardHours                            NA         NA      NA       NA
## StockOptionLevel                 -1.985e-01  1.740e-01  -1.141 0.254050
## TotalWorkingYears                -8.257e-02  3.364e-02  -2.455 0.014102
## TrainingTimesLastYear            -1.768e-01  8.321e-02  -2.125 0.033591
## WorkLifeBalance                  -4.776e-01  1.400e-01  -3.411 0.000648
## YearsAtCompany                    1.105e-01  4.333e-02   2.549 0.010793
## YearsInCurrentRole               -1.479e-01  5.056e-02  -2.926 0.003432
## YearsSinceLastPromotion           1.862e-01  4.763e-02   3.909 9.25e-05
## YearsWithCurrManager             -1.237e-01  4.987e-02  -2.481 0.013111
##                                     
## (Intercept)                         
## ID                                  
## Age                                 
## BusinessTravelTravel_Frequently  ***
## BusinessTravelTravel_Rarely      *  
## DailyRate                           
## DepartmentResearch & Development    
## DepartmentSales                     
## DistanceFromHome                 ***
## Education                           
## EducationFieldLife Sciences         
## EducationFieldMarketing             
## EducationFieldMedical               
## EducationFieldOther                 
## EducationFieldTechnical Degree      
## EmployeeCount                       
## EmployeeNumber                      
## EnvironmentSatisfaction          ***
## GenderMale                       .  
## HourlyRate                          
## JobInvolvement                   ***
## JobLevel                            
## JobRoleHuman Resources              
## JobRoleLaboratory Technician     *  
## JobRoleManager                      
## JobRoleManufacturing Director       
## JobRoleResearch Director         *  
## JobRoleResearch Scientist           
## JobRoleSales Executive              
## JobRoleSales Representative      *  
## JobSatisfaction                  ***
## MaritalStatusMarried             .  
## MaritalStatusSingle              ***
## MonthlyIncome                       
## MonthlyRate                         
## NumCompaniesWorked               ***
## Over18_Y                            
## OverTimeYes                      ***
## PercentSalaryHike                   
## PerformanceRating                   
## RelationshipSatisfaction         *  
## StandardHours                       
## StockOptionLevel                    
## TotalWorkingYears                *  
## TrainingTimesLastYear            *  
## WorkLifeBalance                  ***
## YearsAtCompany                   *  
## YearsInCurrentRole               ** 
## YearsSinceLastPromotion          ***
## YearsWithCurrManager             *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1031.48  on 1169  degrees of freedom
## Residual deviance:  674.24  on 1123  degrees of freedom
## AIC: 768.24
## 
## Number of Fisher Scoring iterations: 15
#predict probabilities on testset
#type="response" gives probabilities, type="class" gives class
# The rank-deficiency warning below matches the summary above: three
# coefficients (EmployeeCount, Over18_Y, StandardHours) are NA / singular.
glm_prob <- predict.glm(glm_model,emp_test[,-3],type="response")
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
## ifelse(type == : prediction from a rank-deficient fit may be misleading
#which classes do these probabilities refer to? What are 1 and 0?
contrasts(emp_test$Attrition)
##     Yes
## No    0
## Yes   1
#make predictions
##.first create vector to hold predictions (we know 0 refers to neg now)
# NOTE(review): despite the name, dfTrain holds TEST-set predicted labels --
# default everything to "No", then flip to "Yes" above a 0.5 probability cutoff
dfTrain <- rep("No",nrow(emp_test))
dfTrain[glm_prob>.5] <- "Yes"
#confusion matrix of predicted vs. actual test-set Attrition
LogRegOnly <-confusionMatrix(table(pred=dfTrain,true=emp_test$Attrition))
LogRegOnly
## Confusion Matrix and Statistics
## 
##      true
## pred   No Yes
##   No  244  32
##   Yes   7  17
##                                           
##                Accuracy : 0.87            
##                  95% CI : (0.8266, 0.9059)
##     No Information Rate : 0.8367          
##     P-Value [Acc > NIR] : 0.0658297       
##                                           
##                   Kappa : 0.4015          
##  Mcnemar's Test P-Value : 0.0001215       
##                                           
##             Sensitivity : 0.9721          
##             Specificity : 0.3469          
##          Pos Pred Value : 0.8841          
##          Neg Pred Value : 0.7083          
##              Prevalence : 0.8367          
##          Detection Rate : 0.8133          
##    Detection Prevalence : 0.9200          
##       Balanced Accuracy : 0.6595          
##                                           
##        'Positive' Class : No              
## 
# Visualize the logistic-regression confusion matrix as a fourfold plot
fourfoldplot(LogRegOnly$table)

#Predict for Doctor Bivin - WHAMMMMooooooo
# Final deliverable: one predicted Attrition label per test-set employee ID
dfPreds = data.frame(emp_test$ID,dfTrain)
colnames(dfPreds) = c("ID","Prediction")
dfPreds
##       ID Prediction
## 1   1171         No
## 2   1172         No
## 3   1173         No
## 4   1174         No
## 5   1175         No
## 6   1176         No
## 7   1177         No
## 8   1178         No
## 9   1179         No
## 10  1180         No
## 11  1181         No
## 12  1182         No
## 13  1183         No
## 14  1184         No
## 15  1185         No
## 16  1186         No
## 17  1187         No
## 18  1188         No
## 19  1189        Yes
## 20  1190         No
## 21  1191         No
## 22  1192         No
## 23  1193         No
## 24  1194         No
## 25  1195         No
## 26  1196        Yes
## 27  1197         No
## 28  1198         No
## 29  1199        Yes
## 30  1200         No
## 31  1201         No
## 32  1202        Yes
## 33  1203         No
## 34  1204         No
## 35  1205         No
## 36  1206         No
## 37  1207         No
## 38  1208         No
## 39  1209         No
## 40  1210         No
## 41  1211         No
## 42  1212         No
## 43  1213         No
## 44  1214         No
## 45  1215         No
## 46  1216         No
## 47  1217        Yes
## 48  1218         No
## 49  1219         No
## 50  1220         No
## 51  1221         No
## 52  1222         No
## 53  1223         No
## 54  1224         No
## 55  1225         No
## 56  1226         No
## 57  1227         No
## 58  1228         No
## 59  1229         No
## 60  1230         No
## 61  1231         No
## 62  1232         No
## 63  1233         No
## 64  1234         No
## 65  1235         No
## 66  1236         No
## 67  1237         No
## 68  1238         No
## 69  1239         No
## 70  1240         No
## 71  1241         No
## 72  1242         No
## 73  1243         No
## 74  1244         No
## 75  1245         No
## 76  1246         No
## 77  1247         No
## 78  1248        Yes
## 79  1249         No
## 80  1250         No
## 81  1251         No
## 82  1252         No
## 83  1253        Yes
## 84  1254         No
## 85  1255         No
## 86  1256         No
## 87  1257         No
## 88  1258         No
## 89  1259         No
## 90  1260         No
## 91  1261         No
## 92  1262         No
## 93  1263         No
## 94  1264         No
## 95  1265         No
## 96  1266         No
## 97  1267         No
## 98  1268        Yes
## 99  1269         No
## 100 1270         No
## 101 1271         No
## 102 1272         No
## 103 1273         No
## 104 1274        Yes
## 105 1275         No
## 106 1276         No
## 107 1277         No
## 108 1278         No
## 109 1279         No
## 110 1280         No
## 111 1281         No
## 112 1282         No
## 113 1283         No
## 114 1284         No
## 115 1285         No
## 116 1286         No
## 117 1287         No
## 118 1288        Yes
## 119 1289         No
## 120 1290         No
## 121 1291         No
## 122 1292        Yes
## 123 1293         No
## 124 1294         No
## 125 1295         No
## 126 1296         No
## 127 1297         No
## 128 1298         No
## 129 1299         No
## 130 1300         No
## 131 1301         No
## 132 1302         No
## 133 1303         No
## 134 1304        Yes
## 135 1305         No
## 136 1306         No
## 137 1307         No
## 138 1308         No
## 139 1309         No
## 140 1310         No
## 141 1311         No
## 142 1312         No
## 143 1313         No
## 144 1314         No
## 145 1315         No
## 146 1316         No
## 147 1317         No
## 148 1318         No
## 149 1319         No
## 150 1320         No
## 151 1321         No
## 152 1322         No
## 153 1323         No
## 154 1324         No
## 155 1325         No
## 156 1326         No
## 157 1327         No
## 158 1328         No
## 159 1329         No
## 160 1330         No
## 161 1331         No
## 162 1332        Yes
## 163 1333         No
## 164 1334        Yes
## 165 1335         No
## 166 1336         No
## 167 1337         No
## 168 1338         No
## 169 1339         No
## 170 1340        Yes
## 171 1341         No
## 172 1342         No
## 173 1343         No
## 174 1344         No
## 175 1345         No
## 176 1346         No
## 177 1347         No
## 178 1348        Yes
## 179 1349        Yes
## 180 1350         No
## 181 1351         No
## 182 1352         No
## 183 1353         No
## 184 1354         No
## 185 1355         No
## 186 1356         No
## 187 1357         No
## 188 1358         No
## 189 1359         No
## 190 1360         No
## 191 1361        Yes
## 192 1362         No
## 193 1363         No
## 194 1364         No
## 195 1365        Yes
## 196 1366         No
## 197 1367         No
## 198 1368         No
## 199 1369         No
## 200 1370         No
## 201 1371         No
## 202 1372         No
## 203 1373         No
## 204 1374         No
## 205 1375         No
## 206 1376         No
## 207 1377         No
## 208 1378         No
## 209 1379         No
## 210 1380         No
## 211 1381         No
## 212 1382         No
## 213 1383         No
## 214 1384         No
## 215 1385         No
## 216 1386         No
## 217 1387         No
## 218 1388         No
## 219 1389         No
## 220 1390         No
## 221 1391         No
## 222 1392         No
## 223 1393         No
## 224 1394         No
## 225 1395         No
## 226 1396         No
## 227 1397         No
## 228 1398         No
## 229 1399         No
## 230 1400        Yes
## 231 1401         No
## 232 1402         No
## 233 1403         No
## 234 1404         No
## 235 1405         No
## 236 1406         No
## 237 1407         No
## 238 1408         No
## 239 1409         No
## 240 1410         No
## 241 1411        Yes
## 242 1412        Yes
## 243 1413         No
## 244 1414         No
## 245 1415         No
## 246 1416         No
## 247 1417         No
## 248 1418         No
## 249 1419         No
## 250 1420         No
## 251 1421         No
## 252 1422         No
## 253 1423         No
## 254 1424         No
## 255 1425         No
## 256 1426         No
## 257 1427         No
## 258 1428         No
## 259 1429         No
## 260 1430         No
## 261 1431         No
## 262 1432         No
## 263 1433         No
## 264 1434         No
## 265 1435         No
## 266 1436        Yes
## 267 1437         No
## 268 1438         No
## 269 1439         No
## 270 1440         No
## 271 1441         No
## 272 1442         No
## 273 1443         No
## 274 1444         No
## 275 1445         No
## 276 1446         No
## 277 1447         No
## 278 1448         No
## 279 1449         No
## 280 1450         No
## 281 1451         No
## 282 1452         No
## 283 1453         No
## 284 1454         No
## 285 1455         No
## 286 1456         No
## 287 1457         No
## 288 1458         No
## 289 1459         No
## 290 1460        Yes
## 291 1461         No
## 292 1462         No
## 293 1463         No
## 294 1464         No
## 295 1465         No
## 296 1466         No
## 297 1467         No
## 298 1468         No
## 299 1469         No
## 300 1470         No
write.csv(dfPreds,file = "LabelPrediction.csv",row.names = FALSE)

Logistic Regression Using LASSO - For Bonus

Building on the logistic regression model above, logistic regression with LASSO was used for automated feature selection. This model revealed that only six variables are needed to predict attrition at 86% accuracy, versus 87% for the general logistic regression that uses every field in the data set. Because the LASSO-based model achieves nearly the same accuracy with far fewer variables, our recommendation is to run the logistic regression model using LASSO: it is more efficient, at a cost of only one percentage point of prediction accuracy.

Even though LASSO was used for feature selection, it is essential to validate the variables that come out of the LASSO model. Starting from the automated LASSO feature selection, further down-selection of variables revealed a better model.

#Begin Logistic Regression with Lasso
#convert training data to matrix format
# cols.CatGLM is defined earlier in the file (not visible here) — presumably it
# selects the candidate predictor columns; verify against its definition.
set.seed(123)
x <- model.matrix(emp_train$Attrition~.,emp_train[,c(cols.CatGLM)])
# Response coded 1 = "Yes" (attrition), 0 = "No"
y <- ifelse(emp_train$Attrition=="Yes",1,0)
# Run Base Model: cross-validated LASSO (alpha = 1) logistic regression
# NOTE(review): type.measure = "mse" is unusual for family = "binomial";
# "class", "auc", or "deviance" are the conventional choices. Changing it would
# alter the selected lambda and all downstream results — confirm before changing.
glm.lasso.new <- cv.glmnet(x,y,alpha=1,family="binomial",type.measure = "mse")
plot(glm.lasso.new)

#min value of lambda (minimizes cross-validated error)
lambda_min <- glm.lasso.new$lambda.min
#best value of lambda (largest lambda within 1 SE of the minimum — sparser model)
lambda_1se <- glm.lasso.new$lambda.1se
#regression coefficients at lambda.1se, returned as a sparse matrix
glm.lasso.new.coef <- coef(glm.lasso.new,s=lambda_1se)
# @i holds 0-based row indices of the non-zero coefficients, hence the +1 offset
data.frame(name = glm.lasso.new.coef@Dimnames[[1]][glm.lasso.new.coef@i + 1], coefficient = glm.lasso.new.coef@x)
##                                name   coefficient
## 1                       (Intercept)  1.5071026023
## 2                               Age -0.0104129673
## 3   BusinessTravelTravel_Frequently  0.5679751924
## 4                         DailyRate -0.0001563157
## 5  DepartmentResearch & Development -0.3175946691
## 6                   DepartmentSales  0.0101192638
## 7                  DistanceFromHome  0.0181403082
## 8           EducationFieldMarketing  0.0664777807
## 9               EducationFieldOther -0.0255696530
## 10   EducationFieldTechnical Degree  0.4419704188
## 11          EnvironmentSatisfaction -0.3050329917
## 12                       GenderMale  0.1548217727
## 13                   JobInvolvement -0.4297487944
## 14     JobRoleLaboratory Technician  0.6614081416
## 15    JobRoleManufacturing Director -0.0282707270
## 16         JobRoleResearch Director -0.5589679652
## 17      JobRoleSales Representative  0.8125101982
## 18                  JobSatisfaction -0.2201139436
## 19              MaritalStatusSingle  0.6026889077
## 20               NumCompaniesWorked  0.0890748733
## 21                      OverTimeYes  1.3690621543
## 22         RelationshipSatisfaction -0.0564621601
## 23                 StockOptionLevel -0.1346097156
## 24                TotalWorkingYears -0.0324705552
## 25            TrainingTimesLastYear -0.0647294624
## 26                  WorkLifeBalance -0.2452690840
## 27               YearsInCurrentRole -0.0427828135
## 28          YearsSinceLastPromotion  0.0753910193
## 29             YearsWithCurrManager -0.0263903970
# Get column indices of the non-zero LASSO coefficients
# NOTE(review): @i gives 0-based positions within the MODEL MATRIX (i.e. the
# dummy-expanded columns), not columns of emp_train. Indexing emp_train with
# these values below may not select the LASSO-chosen variables — confirm the
# intended mapping before relying on this subset.
cols.lasso.coef <- glm.lasso.new.coef@i
cols.lasso.coef <- cols.lasso.coef[-1] # Remove the intercept (its @i is 0)
train.reduce = emp_train[,cols.lasso.coef]
# Drop column 20 of the reduced frame
# NOTE(review): hard-coded position — document which variable this removes.
train.reduce = train.reduce[,-20]
#Assess Model: ordinary logistic regression on the reduced variable set
glm.assess <- glm(Attrition~.,data = train.reduce, family = "binomial")
summary(glm.assess)
## 
## Call:
## glm(formula = Attrition ~ ., family = "binomial", data = train.reduce)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.7261  -0.5498  -0.3276  -0.1633   3.0145  
## 
## Coefficients: (8 not defined because of singularities)
##                                       Estimate Std. Error z value Pr(>|z|)
## (Intercept)                          1.633e+00  1.450e+00   1.126 0.259981
## BusinessTravelTravel_Frequently      1.657e+00  4.328e-01   3.829 0.000129
## BusinessTravelTravel_Rarely          8.315e-01  4.039e-01   2.059 0.039535
## DepartmentResearch & Development     3.979e-03  6.060e-01   0.007 0.994761
## DepartmentSales                      4.911e-01  6.244e-01   0.787 0.431563
## DistanceFromHome                     3.036e-02  1.087e-02   2.792 0.005236
## Education                            1.738e-02  9.186e-02   0.189 0.849927
## EducationFieldLife Sciences         -4.156e-01  1.051e+00  -0.395 0.692639
## EducationFieldMarketing             -3.083e-01  1.096e+00  -0.281 0.778580
## EducationFieldMedical               -4.610e-01  1.058e+00  -0.436 0.662936
## EducationFieldOther                 -8.971e-01  1.116e+00  -0.804 0.421661
## EducationFieldTechnical Degree       3.297e-01  1.065e+00   0.310 0.756888
## EnvironmentSatisfaction             -4.162e-01  8.755e-02  -4.754 2.00e-06
## HourlyRate                           1.995e-03  4.623e-03   0.432 0.666020
## JobInvolvement                      -6.546e-01  1.289e-01  -5.078 3.82e-07
## JobSatisfaction                     -3.717e-01  8.411e-02  -4.419 9.91e-06
## MaritalStatusMarried                 5.398e-01  2.858e-01   1.889 0.058957
## MaritalStatusSingle                  1.436e+00  2.860e-01   5.019 5.19e-07
## MonthlyRate                         -9.948e-06  1.305e-05  -0.762 0.445887
## OverTimeYes                          1.655e+00  1.955e-01   8.467  < 2e-16
## PerformanceRating                   -7.501e-02  2.627e-01  -0.286 0.775196
## RelationshipSatisfaction            -1.630e-01  8.535e-02  -1.909 0.056247
## TotalWorkingYears                   -7.401e-02  1.935e-02  -3.825 0.000131
## TrainingTimesLastYear               -1.861e-01  7.646e-02  -2.435 0.014907
## YearsAtCompany                       5.778e-02  3.219e-02   1.795 0.072654
## YearsWithCurrManager                -1.336e-01  4.629e-02  -2.885 0.003908
## `BusinessTravel_Non-Travel`                 NA         NA      NA       NA
## `Department_Human Resources`                NA         NA      NA       NA
## `Department_Research & Development`         NA         NA      NA       NA
## Department_Sales                            NA         NA      NA       NA
## `EducationField_Human Resources`            NA         NA      NA       NA
## EducationField_Medical                      NA         NA      NA       NA
## `EducationField_Life Sciences`              NA         NA      NA       NA
## EducationField_Other                        NA         NA      NA       NA
##                                        
## (Intercept)                            
## BusinessTravelTravel_Frequently     ***
## BusinessTravelTravel_Rarely         *  
## DepartmentResearch & Development       
## DepartmentSales                        
## DistanceFromHome                    ** 
## Education                              
## EducationFieldLife Sciences            
## EducationFieldMarketing                
## EducationFieldMedical                  
## EducationFieldOther                    
## EducationFieldTechnical Degree         
## EnvironmentSatisfaction             ***
## HourlyRate                             
## JobInvolvement                      ***
## JobSatisfaction                     ***
## MaritalStatusMarried                .  
## MaritalStatusSingle                 ***
## MonthlyRate                            
## OverTimeYes                         ***
## PerformanceRating                      
## RelationshipSatisfaction            .  
## TotalWorkingYears                   ***
## TrainingTimesLastYear               *  
## YearsAtCompany                      .  
## YearsWithCurrManager                ** 
## `BusinessTravel_Non-Travel`            
## `Department_Human Resources`           
## `Department_Research & Development`    
## Department_Sales                       
## `EducationField_Human Resources`       
## EducationField_Medical                 
## `EducationField_Life Sciences`         
## EducationField_Other                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1031.48  on 1169  degrees of freedom
## Residual deviance:  773.81  on 1144  degrees of freedom
## AIC: 825.81
## 
## Number of Fisher Scoring iterations: 6
# Keep only the predictors that were significant in glm.assess (by position
# within train.reduce).
keep_cols <- c(2, 4, 7, 9, 10, 11, 13, 15:19)
train.reduce.final <- train.reduce[, keep_cols]
# Refit: response taken from train.reduce, predictors from the reduced frame
glm.finalversion <- glm(
  train.reduce$Attrition ~ .,
  data = train.reduce.final,
  family = "binomial"
)
summary(glm.finalversion)
## 
## Call:
## glm(formula = train.reduce$Attrition ~ ., family = "binomial", 
##     data = train.reduce.final)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.7065  -0.5495  -0.3416  -0.1739   3.0886  
## 
## Coefficients:
##                                 Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                      1.26617    0.70396   1.799 0.072075 .  
## BusinessTravelTravel_Frequently  1.62937    0.42961   3.793 0.000149 ***
## BusinessTravelTravel_Rarely      0.83255    0.40206   2.071 0.038387 *  
## DistanceFromHome                 0.02919    0.01074   2.717 0.006578 ** 
## EnvironmentSatisfaction         -0.41460    0.08516  -4.868 1.12e-06 ***
## JobInvolvement                  -0.65002    0.12591  -5.162 2.44e-07 ***
## JobSatisfaction                 -0.37959    0.08266  -4.592 4.39e-06 ***
## MaritalStatusMarried             0.54855    0.28311   1.938 0.052680 .  
## MaritalStatusSingle              1.44335    0.28297   5.101 3.38e-07 ***
## OverTimeYes                      1.58614    0.19070   8.318  < 2e-16 ***
## RelationshipSatisfaction        -0.16695    0.08406  -1.986 0.047030 *  
## TotalWorkingYears               -0.07556    0.01896  -3.985 6.74e-05 ***
## TrainingTimesLastYear           -0.17441    0.07491  -2.328 0.019894 *  
## YearsAtCompany                   0.05781    0.03161   1.829 0.067389 .  
## YearsWithCurrManager            -0.12754    0.04492  -2.839 0.004519 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1031.48  on 1169  degrees of freedom
## Residual deviance:  790.81  on 1155  degrees of freedom
## AIC: 820.81
## 
## Number of Fisher Scoring iterations: 6
# Drop column 6 of train.reduce.final
# NOTE(review): the original comment said "Remove Monthly Rate", but comparing
# the glm.finalversion and glm.finalfinal summaries, the predictor that
# disappears is MaritalStatus — confirm which column was actually intended.
train.reduce.final.version <- train.reduce.final[,-6]
#Reassess Model without the dropped column
glm.finalfinal <- glm(train.reduce$Attrition~.,data = train.reduce.final.version, family = "binomial")
summary(glm.finalfinal)
## 
## Call:
## glm(formula = train.reduce$Attrition ~ ., family = "binomial", 
##     data = train.reduce.final.version)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.7758  -0.5742  -0.3691  -0.2011   3.0544  
## 
## Coefficients:
##                                 Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                      2.10903    0.64902   3.250  0.00116 ** 
## BusinessTravelTravel_Frequently  1.65976    0.41250   4.024 5.73e-05 ***
## BusinessTravelTravel_Rarely      0.83381    0.38527   2.164  0.03045 *  
## DistanceFromHome                 0.02510    0.01045   2.402  0.01629 *  
## EnvironmentSatisfaction         -0.42533    0.08323  -5.110 3.22e-07 ***
## JobInvolvement                  -0.66063    0.12169  -5.429 5.68e-08 ***
## JobSatisfaction                 -0.35906    0.08072  -4.448 8.67e-06 ***
## OverTimeYes                      1.51959    0.18471   8.227  < 2e-16 ***
## RelationshipSatisfaction        -0.15570    0.08204  -1.898  0.05771 .  
## TotalWorkingYears               -0.07788    0.01879  -4.144 3.41e-05 ***
## TrainingTimesLastYear           -0.16452    0.07313  -2.250  0.02447 *  
## YearsAtCompany                   0.04605    0.03081   1.494  0.13509    
## YearsWithCurrManager            -0.11105    0.04356  -2.550  0.01079 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1031.48  on 1169  degrees of freedom
## Residual deviance:  826.83  on 1157  degrees of freedom
## AIC: 852.83
## 
## Number of Fisher Scoring iterations: 5
glm.finalfinal$coefficients
##                     (Intercept) BusinessTravelTravel_Frequently 
##                      2.10902537                      1.65976133 
##     BusinessTravelTravel_Rarely                DistanceFromHome 
##                      0.83381424                      0.02510475 
##         EnvironmentSatisfaction                  JobInvolvement 
##                     -0.42532783                     -0.66062915 
##                 JobSatisfaction                     OverTimeYes 
##                     -0.35905872                      1.51959223 
##        RelationshipSatisfaction               TotalWorkingYears 
##                     -0.15569624                     -0.07788303 
##           TrainingTimesLastYear                  YearsAtCompany 
##                     -0.16451544                      0.04604716 
##            YearsWithCurrManager 
##                     -0.11105360
exp(cbind(coef(glm.finalfinal), confint(glm.finalfinal)))
## Waiting for profiling to be done...
##                                               2.5 %     97.5 %
## (Intercept)                     8.2402062 2.2938743 29.4491357
## BusinessTravelTravel_Frequently 5.2580558 2.4356727 12.4089926
## BusinessTravelTravel_Rarely     2.3020827 1.1300236  5.1772887
## DistanceFromHome                1.0254225 1.0044848  1.0465530
## EnvironmentSatisfaction         0.6535555 0.5542340  0.7683807
## JobInvolvement                  0.5165263 0.4059855  0.6545741
## JobSatisfaction                 0.6983333 0.5953772  0.8173264
## OverTimeYes                     4.5703612 3.1912666  6.5891055
## RelationshipSatisfaction        0.8558191 0.7284723  1.0052186
## TotalWorkingYears               0.9250726 0.8900473  0.9582470
## TrainingTimesLastYear           0.8483047 0.7334475  0.9772964
## YearsAtCompany                  1.0471238 0.9838192  1.1109003
## YearsWithCurrManager            0.8948908 0.8222133  0.9757138
# Test Model on Test Dataset: score the final reduced model on the hold-out set
test <- emp_test
# Use the generic predict() (dispatches to predict.glm); drop Attrition (col 3)
test$final.prob <- predict(glm.finalfinal, test[,-3], type = "response")
# Classify with a 0.5 probability cutoff
test$final.predicted <- ifelse(test$final.prob > 0.5, "Yes", "No")
# Label the table dimensions for consistency with the LogRegOnly matrix
Lassofinal <- confusionMatrix(table(pred = test$final.predicted, true = test$Attrition))
Lassofinal
## Confusion Matrix and Statistics
## 
##      
##        No Yes
##   No  246  37
##   Yes   5  12
##                                           
##                Accuracy : 0.86            
##                  95% CI : (0.8155, 0.8972)
##     No Information Rate : 0.8367          
##     P-Value [Acc > NIR] : 0.1548          
##                                           
##                   Kappa : 0.3052          
##  Mcnemar's Test P-Value : 1.724e-06       
##                                           
##             Sensitivity : 0.9801          
##             Specificity : 0.2449          
##          Pos Pred Value : 0.8693          
##          Neg Pred Value : 0.7059          
##              Prevalence : 0.8367          
##          Detection Rate : 0.8200          
##    Detection Prevalence : 0.9433          
##       Balanced Accuracy : 0.6125          
##                                           
##        'Positive' Class : No              
## 
fourfoldplot(Lassofinal$table)

Summary Explanation:

The following table outlines the four different models that were evaluated, ranked by accuracy. Plain logistic regression was the most accurate; however, we recommend logistic regression with LASSO because it achieves nearly the same accuracy with far fewer variables.

# Prediction Models
# Review the confusion matrix for each candidate model, best first
LogRegOnly # Logistic regression
## Confusion Matrix and Statistics
## 
##      true
## pred   No Yes
##   No  244  32
##   Yes   7  17
##                                           
##                Accuracy : 0.87            
##                  95% CI : (0.8266, 0.9059)
##     No Information Rate : 0.8367          
##     P-Value [Acc > NIR] : 0.0658297       
##                                           
##                   Kappa : 0.4015          
##  Mcnemar's Test P-Value : 0.0001215       
##                                           
##             Sensitivity : 0.9721          
##             Specificity : 0.3469          
##          Pos Pred Value : 0.8841          
##          Neg Pred Value : 0.7083          
##              Prevalence : 0.8367          
##          Detection Rate : 0.8133          
##    Detection Prevalence : 0.9200          
##       Balanced Accuracy : 0.6595          
##                                           
##        'Positive' Class : No              
## 
Lassofinal # LogRessions with Lasso
## Confusion Matrix and Statistics
## 
##      
##        No Yes
##   No  246  37
##   Yes   5  12
##                                           
##                Accuracy : 0.86            
##                  95% CI : (0.8155, 0.8972)
##     No Information Rate : 0.8367          
##     P-Value [Acc > NIR] : 0.1548          
##                                           
##                   Kappa : 0.3052          
##  Mcnemar's Test P-Value : 1.724e-06       
##                                           
##             Sensitivity : 0.9801          
##             Specificity : 0.2449          
##          Pos Pred Value : 0.8693          
##          Neg Pred Value : 0.7059          
##              Prevalence : 0.8367          
##          Detection Rate : 0.8200          
##    Detection Prevalence : 0.9433          
##       Balanced Accuracy : 0.6125          
##                                           
##        'Positive' Class : No              
## 
knnPrediction # kNN 
## Confusion Matrix and Statistics
## 
##         
## knn.test  No Yes
##      No  251  48
##      Yes   0   1
##                                           
##                Accuracy : 0.84            
##                  95% CI : (0.7935, 0.8796)
##     No Information Rate : 0.8367          
##     P-Value [Acc > NIR] : 0.4758          
##                                           
##                   Kappa : 0.0337          
##  Mcnemar's Test P-Value : 1.17e-11        
##                                           
##             Sensitivity : 1.00000         
##             Specificity : 0.02041         
##          Pos Pred Value : 0.83946         
##          Neg Pred Value : 1.00000         
##              Prevalence : 0.83667         
##          Detection Rate : 0.83667         
##    Detection Prevalence : 0.99667         
##       Balanced Accuracy : 0.51020         
##                                           
##        'Positive' Class : No              
## 
kWeightedPrediction # K Weighted
## Confusion Matrix and Statistics
## 
##           
## prediction  No Yes
##        No  251  47
##        Yes   0   2
##                                           
##                Accuracy : 0.8433          
##                  95% CI : (0.7972, 0.8826)
##     No Information Rate : 0.8367          
##     P-Value [Acc > NIR] : 0.4139          
##                                           
##                   Kappa : 0.0665          
##  Mcnemar's Test P-Value : 1.949e-11       
##                                           
##             Sensitivity : 1.00000         
##             Specificity : 0.04082         
##          Pos Pred Value : 0.84228         
##          Neg Pred Value : 1.00000         
##              Prevalence : 0.83667         
##          Detection Rate : 0.83667         
##    Detection Prevalence : 0.99333         
##       Balanced Accuracy : 0.52041         
##                                           
##        'Positive' Class : No              
## 
# Create Prediction Summary Table
# One row per model: the 7 overall metrics (Accuracy, Kappa, ...) followed by
# the 11 by-class metrics, plus a Type label column (19 columns total).
dt0 <- data.frame(cbind(t(LogRegOnly$overall), t(LogRegOnly$byClass)))
dt0$Type <- "LogRegOnly"
dt1 <- data.frame(cbind(t(knnPrediction$overall), t(knnPrediction$byClass)))
dt1$Type <- "kNN"
dt3 <- data.frame(cbind(t(kWeightedPrediction$overall), t(kWeightedPrediction$byClass)))
dt3$Type <- "kWeighted"
dt4 <- data.frame(cbind(t(Lassofinal$overall), t(Lassofinal$byClass)))
dt4$Type <- "Lassofinal"
SummaryPred <- rbind(dt0, dt1, dt3, dt4)
# Sort by descending accuracy, then move Type (column 19) to the front
SummaryPred <- SummaryPred[order(-SummaryPred$Accuracy), ]
SummaryPred <- SummaryPred[, c(19, 1:18)]
# Keep Type, Accuracy, Sensitivity, Specificity for the display table
SummaryT <- SummaryPred[, c(1, 2, 9, 10)]
#SummaryPred  %>%  kable() %>% kable_styling(bootstrap_options = "striped", full_width = F) %>% scroll_box(width = "100%", height = "200px")
SummaryT %>%
  kable() %>%
  kable_styling(bootstrap_options = "striped", full_width = FALSE) %>%
  scroll_box(width = "100%", height = "200px")
Type Accuracy Sensitivity Specificity
1 LogRegOnly 0.8700000 0.9721116 0.3469388
4 Lassofinal 0.8600000 0.9800797 0.2448980
3 kWeighted 0.8433333 1.0000000 0.0408163
2 kNN 0.8400000 1.0000000 0.0204082
# Retained exploratory code (commented out): earlier kNN workflow with
# machine-specific file paths. Kept for reference only — not executed.
#dfTrain <- read.csv("E:/Documents/School/MSDS 6306/Case Study 2/CaseStudy2-data.csv", na.strings = "Null")
#dfVal <- read.csv("E:/Documents/School/MSDS 6306/Case Study 2/CaseStudy2Validation.csv", na.strings = "Null")
#Grant Path
#dfTrain <- read.csv("CaseStudy2-data.csv", na.strings = "NULL")
#dfVal <- read.csv("CaseStudy2Validation.csv", na.strings = "NULL")

#dfTrain2 <- fastDummies::dummy_cols(dfTrain) # Create Dummy Variables
#dfVal2 <- fastDummies::dummy_cols(dfVal) # Create Dummy

#Col.KNN <- c(2,5,7,8,10,11,12,14,15,16,18,20,21,22,25,26,27,28,29,30,31,32,33,34,35,36,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68)

# KNN
# NOTE(review): the vector above is named Col.KNN but the train() call below
# uses cols.CatKNN — the names do not match; confirm which was intended if
# this code is ever revived.
#set.seed(1)
#fit = train(Attrition~., data=dfTrain2[,c(cols.CatKNN)], method="knn")
#dfVal$Predictions = predict(fit,dfVal2)

#dfPreds = data.frame(dfVal$ID,dfVal$Predictions)
#colnames(dfPreds) = c("ID","Prediction")
#dfPreds

#write.csv(dfPreds,file = "LabelPrediction.csv",row.names = FALSE)

Appendix A - Reference Material